IMPORT

1. Librairies

library(phyloseq) # for phyloseq object
library(ggplot2)
library(ggsignif)
library(RColorBrewer)
library(cowplot)
library(plyr)
library(dplyr)
library("plotly") # plot 3D
library("microbiome") # for centered log-ratio
library("coda") # Aitchison distance
library("coda.base") # Aitchison distance
library("vegan") # NMDS
library(pheatmap) # for heatmap

2. Data

# Set path
path <- "~/Projects/IBS_Meta-analysis_16S"
path.plots <- file.path(path, "data/analysis-individual/PLOTS/plots-Pozuelo-EDA")

# Import phyloseq object
physeq.pozuelo <- readRDS(file.path(path, "phyloseq-objects/physeq_pozuelo.rds"))

# Sanity check
physeq.pozuelo
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 22206 taxa and 273 samples ]
## sample_data() Sample Data:       [ 273 samples by 10 sample variables ]
## tax_table()   Taxonomy Table:    [ 22206 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 22206 tips and 22204 internal nodes ]
## refseq()      DNAStringSet:      [ 22206 reference sequences ]

Phylogenetic tree was computed with the package phangorn, and the script was run on a cluster. Let’s check we have correctly generated a phylogenetic tree.

# Look at the tree
plot_tree(physeq.pozuelo, color = "Phylum", ladderize="left")

DEMOGRAPHICS

This dataset has covariates on collection date. Some individuals had two collection time points. Let’s verify the distribution is similar between healthy & IBS groups.

# Number of individuals in each group
metadata <- data.frame(sample_data(physeq.pozuelo))
metadata %>%
  count(host_disease)
# Collection date
df <- metadata %>%
  group_by(collection_date, host_disease) %>%
  summarize(count=n())
ggplot(df, aes(x = collection_date, y = count)) + 
  geom_point(aes(color = host_disease), size = 2) +
  scale_color_manual(values = c("#00AFBB", "#E7B800")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle=90)) +
  labs(x="", y="# samples collected")

# Double stool collection
metadata %>%
  group_by(Collection, host_disease) %>%
  summarize(count=n())
# to note, only 1/3 of healthy individuals provided 2 stool samples, while 2/3 of IBS individuals provided 2 stool samples

ABUNDANCES

1. Absolute abundances

# Plot Phylum
plot_bar(physeq.pozuelo, fill = "Phylum") + facet_wrap("host_disease", scales="free") +
  theme(axis.text.x = element_blank())+
  labs(x = "Samples", y = "Absolute abundance", title = "Pozuelo dataset (2015)") +
  ylim(0,80000)

# ggsave(file.path(path.plots, "absAbundance_phylum.jpg"), width=13, height=5)

# Plot Class
plot_bar(physeq.pozuelo, fill = "Class")+ facet_wrap("host_disease", scales="free") +
  theme(axis.text.x = element_blank())+
  labs(x = "Samples", y = "Absolute abundance", title = "Pozuelo dataset (2015)")+
  ylim(0,80000)

Sequencing depth characteristics of the Pozuelo dataset:
- minimum of 1.528810^{4} total count per sample
- median: 4.083210^{4} total count per sample
- maximum of 7.724810^{4} total count per sample

2. Relative abundances

# Agglomerate to phylum & class levels
phylum.table <- physeq.pozuelo %>%
  tax_glom(taxrank = "Phylum") %>%                     # agglomerate at phylum level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt()                                             # Melt to long format

class.table <- physeq.pozuelo %>%
  tax_glom(taxrank = "Class") %>%
  transform_sample_counts(function(x) {x/sum(x)} ) %>%
  psmelt()


# Plot relative abundances
ggplot(phylum.table, aes(x = reorder(Sample, Sample, function(x) mean(phylum.table[Sample == x & Phylum == 'Firmicutes', 'Abundance'])),
                         y = Abundance, fill = Phylum))+
  facet_wrap(~ host_disease, scales = "free_x") + # scales = "free" removes empty lines
  geom_bar(stat = "identity", width=1) +
  theme(axis.text.x = element_blank())+
  labs(x = "Samples", y = "Relative abundance", title = "Pozuelo dataset (2015)")

# ggsave(file.path(path.plots, "relAbundance_phylum.jpg"), width=10, height=5)

ggplot(class.table, aes(x = reorder(Sample, Sample, function(x) mean(class.table[Sample == x & Phylum == 'Firmicutes', 'Abundance'])),
                        y = Abundance, fill = Class))+
  facet_wrap(~ host_disease, scales = "free_x") +
  geom_bar(stat = "identity", width=1) +
  theme(axis.text.x = element_blank())+
  labs(x = "Samples", y = "Relative abundance", title = "Pozuelo dataset (2015)")

# ggsave(file.path(path.plots, "relAbundance_class.jpg"), width=12, height=5)
# Plot relative abundance of paired samples (plot both collection times side-by-side)
ggplot(phylum.table %>% group_by(host_ID) %>% filter(n_distinct(Run)>1),
       aes(x=Collection, y=Abundance, fill=Phylum))+
  facet_wrap(~host_ID, scales="free_x", nrow=1, strip.position="bottom")+
  geom_bar(stat="identity", width=1)+
  scale_y_continuous(expand = c(0, 0))+
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        strip.text = element_text(angle=90),
        strip.background = element_rect(fill="white"))+
  labs(x = "Individual", y = "Relative abundance", title = "Relative abundance of phyla at both collection time points (left t0, right t0+1month)")

# ggsave(file.path(path.plots, "relAbundance_phylum_collection-times.jpg"), width=19, height=7)

3. Firmicutes/Bacteroidota ratio

# Extract abundance of only Bacteroidota and Firmicutes
bacter <- phylum.table %>%
  filter(Phylum == "Bacteroidota") %>%
  select(c('Sample', 'Abundance', 'host_disease', 'host_subtype', 'Phylum', 'Collection', 'host_ID')) %>%
  arrange(Sample)

firmi <- phylum.table %>%
  filter(Phylum == "Firmicutes") %>%
  select(c('Sample', 'Abundance', 'host_disease', 'host_subtype', 'Phylum', 'Collection', 'host_ID')) %>%
  arrange(Sample)

# Calculate log2 ratio Firmicutes/Bacteroidota
ratio.FB <- data.frame('Sample' = bacter$Sample,
                       'host_ID' = bacter$host_ID,
                       'host_disease' = bacter$host_disease,
                       'host_subtype' = bacter$host_subtype,
                       'Bacteroidota' = bacter$Abundance,
                       'Firmicutes' = firmi$Abundance,
                       'Collection' = bacter$Collection)
ratio.FB$logRatioFB <- log2(ratio.FB$Firmicutes / ratio.FB$Bacteroidota)

# Plot log2 ratio Firmicutes/Bacteroidota
ggplot(ratio.FB, aes(x = host_disease, y = logRatioFB))+
  geom_violin(aes(fill=host_disease))+
  scale_fill_manual(values=scales::alpha(c("blue", "red"), .3))+
  geom_jitter(width=0.1)+
  geom_signif(comparisons = list(c("Healthy", "IBS")), map_signif_level = TRUE, test="wilcox.test") +
  labs(x = "",  y = 'Log2(Firmicutes/Bacteroidota)', title = "Both collection time points")+
  theme_cowplot()+
  theme(legend.position="none")

# ggsave(file.path(path.plots, "ratioFB.jpg"), width=5, height=7)
wilcox.test(ratio.FB[ratio.FB$host_disease == "IBS","logRatioFB"],
            ratio.FB[ratio.FB$host_disease == "Healthy","logRatioFB"]) # p = 0.0001
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ratio.FB[ratio.FB$host_disease == "IBS", "logRatioFB"] and ratio.FB[ratio.FB$host_disease == "Healthy", "logRatioFB"]
## W = 5832, p-value = 0.0001539
## alternative hypothesis: true location shift is not equal to 0
# Plot log2 ratio Firmicutes/Bacteroidota per IBS subtype
ggplot(ratio.FB, aes(x = host_subtype, y = logRatioFB))+
  facet_wrap(~Collection) +
  geom_violin(aes(fill=host_disease))+
  scale_fill_manual(values=scales::alpha(c("blue", "red"), .3))+
  geom_jitter(width=0.1)+
  geom_signif(comparisons = list(c("HC", "IBS-C"), c("HC", "IBS-D"), c("HC", "IBS-M")),
              map_signif_level = TRUE, test="wilcox.test", step_increase=0.1) +
  labs(x = "",  y = 'Log2(Firmicutes/Bacteroidota)', title = "Data separated by collection time point")+
  theme_cowplot() +
  theme(legend.position="none")

# ggsave(file.path(path.plots, "ratioFB_subtype.jpg"), width=6, height=6)

# Statistical test HC vs IBS-C
wilcox.test(ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "HC","logRatioFB"],
            ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "IBS-C","logRatioFB"]) # p = 0.12
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "HC", "logRatioFB"] and ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "IBS-C", "logRatioFB"]
## W = 1260, p-value = 0.1232
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "HC","logRatioFB"],
            ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "IBS-C","logRatioFB"]) # p = 0.03
## 
##  Wilcoxon rank sum exact test
## 
## data:  ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "HC", "logRatioFB"] and ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "IBS-C", "logRatioFB"]
## W = 390, p-value = 0.03132
## alternative hypothesis: true location shift is not equal to 0
# Statistical test HC vs IBS-D
wilcox.test(ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "HC","logRatioFB"],
            ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "IBS-D","logRatioFB"]) # p = 0.01
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "HC", "logRatioFB"] and ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "IBS-D", "logRatioFB"]
## W = 2260, p-value = 0.01177
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "HC","logRatioFB"],
            ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "IBS-D","logRatioFB"]) # p = 0.004
## 
##  Wilcoxon rank sum exact test
## 
## data:  ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "HC", "logRatioFB"] and ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "IBS-D", "logRatioFB"]
## W = 528, p-value = 0.004091
## alternative hypothesis: true location shift is not equal to 0
# Statistical test HC vs IBS-M
wilcox.test(ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "HC","logRatioFB"],
            ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "IBS-M","logRatioFB"]) # p = 0.08
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "HC", "logRatioFB"] and ratio.FB[ratio.FB$Collection == "1st" & ratio.FB$host_subtype == "IBS-M", "logRatioFB"]
## W = 1100, p-value = 0.07761
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "HC","logRatioFB"],
            ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "IBS-M","logRatioFB"]) # p = 0.06
## 
##  Wilcoxon rank sum exact test
## 
## data:  ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "HC", "logRatioFB"] and ratio.FB[ratio.FB$Collection == "2nd" & ratio.FB$host_subtype == "IBS-M", "logRatioFB"]
## W = 199, p-value = 0.05738
## alternative hypothesis: true location shift is not equal to 0
# Plot paired data
ggplot(ratio.FB, aes(x = Collection, y = logRatioFB))+
  geom_violin(aes(fill=host_disease))+
  scale_fill_manual(values=scales::alpha(c("blue", "red"), .3))+
  geom_point()+
  geom_line(aes(group=host_ID), lwd=0.1) +
  facet_wrap(~host_disease) +
  geom_signif(comparisons = list(c("1st", "2nd")),
              map_signif_level = TRUE, test="wilcox.test") +
  labs(x = "Collection time point",  y = 'Log2(Firmicutes/Bacteroidota)', title = "Paired data") +
  theme_cowplot()+
  theme(legend.position="none")

# ggsave(file.path(path.plots, "ratioFB_collection-time.jpg"), width=5, height=7)

ggplot(ratio.FB, aes(x = host_disease, y = logRatioFB))+
  geom_violin(aes(fill=host_disease))+
  scale_fill_manual(values=scales::alpha(c("blue", "red"), .3))+
  geom_jitter(width=0.1)+
  facet_wrap(~Collection) +
  geom_signif(comparisons = list(c("Healthy", "IBS")),
              map_signif_level = TRUE, test="wilcox.test") +
  labs(x = "",  y = 'Log2(Firmicutes/Bacteroidota)', title = "Collection time point (t0, t0+1mo)") +
  theme_cowplot()+
  theme(legend.position="none")

# ggsave(file.path(path.plots, "ratioFB_collection-time2.jpg"), width=5, height=7)

NORMALIZE DATA

# Sanity check no sample with less than 500 total count
table(sample_sums(physeq.pozuelo)<500) # all FALSE

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH NON-ZERO COMPOSITIONS
physeq.NZcomp <- physeq.pozuelo
otu_table(physeq.NZcomp)[otu_table(physeq.NZcomp) == 0] <- 0.5 # pseudocounts

# Sanity check that 0 values have been replaced
# otu_table(physeq.pozuelo)[1:5,1:5]
# otu_table(physeq.NZcomp)[1:5,1:5]

# transform into compositions
physeq.NZcomp <- transform_sample_counts(physeq.NZcomp, function(x) x / sum(x) )
table(rowSums(otu_table(physeq.NZcomp))) # check if there is any row not summing to 1

# Save object
saveRDS(physeq.NZcomp, file.path(path, "data/analysis-individual/Pozuelo-2015/02_EDA-Pozuelo/physeq_NZcomp.rds"))

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH RELATIVE COUNT (BETWEEN 0 AND 1)
physeq.rel <- physeq.pozuelo
physeq.rel <- transform_sample_counts(physeq.rel, function(x) x / sum(x) )

# check the counts are all relative
# otu_table(physeq.pozuelo)[1:5, 1:5]
# otu_table(physeq.rel)[1:5, 1:5]

# sanity check
table(rowSums(otu_table(physeq.rel))) # check if there is any row not summing to 1

# save the physeq.rel object
saveRDS(physeq.rel, file.path(path, "data/analysis-individual/Pozuelo-2015/02_EDA-Pozuelo/physeq_relative.rds"))

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH COMMON-SCALE NORMALIZATION
physeq.CSN <- physeq.pozuelo
physeq.CSN <- transform_sample_counts(physeq.CSN, function(x) (x*min(sample_sums(physeq.CSN))) / sum(x) )

# sanity check
table(rowSums(otu_table(physeq.CSN))) # check that all rows are summing to the same total

# save the physeq.CSN object
saveRDS(physeq.CSN, file.path(path, "data/analysis-individual/Pozuelo-2015/02_EDA-Pozuelo/physeq_CSN.rds"))


#____________________________________________________________________
# PHYLOSEQ OBJECT WITH CENTERED LOG RATIO COUNT
physeq.clr <- physeq.pozuelo
physeq.clr <- microbiome::transform(physeq.pozuelo, "clr") # the function adds pseudocounts itself

# Compare the otu tables in the original phyloseq object and the new one after CLR transformation
otu_table(physeq.pozuelo)[1:5, 1:5] # should contain absolute counts
otu_table(physeq.clr)[1:5, 1:5] # should all be relative

# save the physeq.rel object
saveRDS(physeq.clr, file.path(path, "data/analysis-individual/Pozuelo-2015/02_EDA-Pozuelo/physeq_clr.rds"))

COMPUTE DISTANCES

1. UniFrac, Aitchison, Bray-Curtis and Canberra

First, let’s look at these four distances of interest.

#____________________________________________________________________________________
# Measure distances
getDistances <- function(relPhyseq=physeq.rel, clrPhyseq=physeq.clr, csnPhyseq=physeq.CSN, nzcompPhyseq=physeq.NZcomp){
  # sanity check
  cat("nb samples relPhyseq:", nsamples(relPhyseq), "\n")
  cat("nb samples clrPhyseq:", nsamples(clrPhyseq), "\n")
  cat("nb samples csnPhyseq:", nsamples(csnPhyseq), "\n")
  cat("nb samples nzcompPhyseq:", nsamples(nzcompPhyseq), "\n")
  
  # Compute distances
  print("Unifrac...")
  set.seed(123) # for unifrac, need to set a seed
  glom.UniF <- UniFrac(relPhyseq, weighted=TRUE, normalized=TRUE) # weighted unifrac
  print("Aitchison...")
  glom.ait <- phyloseq::distance(clrPhyseq, method = 'euclidean') # aitchison
  print("Bray des bois...")
  glom.bray <- phyloseq::distance(csnPhyseq, method = "bray") # bray-curtis
  print("Canberra <3...")
  glom.can <- phyloseq::distance(nzcompPhyseq, method = "canberra") # canberra
  
  dist.list <- list("UniF" = glom.UniF, "Ait" = glom.ait, "Canb" = glom.can, "Bray" = glom.bray)
  
  return(dist.list)
}


#____________________________________________________________________________________
# Plot in 2D the distances
plotDistances2D <- function(dlist, ordination="MDS", relPhyseq=physeq.rel, clrPhyseq=physeq.clr, csnPhyseq=physeq.CSN, nzcompPhyseq=physeq.NZcomp){
  plist <- NULL
  plist <- vector("list", 4)
  names(plist) <- c("Weighted Unifrac", "Aitchison", "Bray-Curtis", "Canberra")
  
  print("Unifrac")
  # Weighted UniFrac
  set.seed(123)
  iMDS.UniF <- ordinate(relPhyseq, ordination, distance=dlist$UniF)
  plist[[1]] <- plot_ordination(relPhyseq, iMDS.UniF, color="host_disease")
  
  print("Aitchison")
  # Aitchison
  set.seed(123)
  iMDS.Ait <- ordinate(clrPhyseq, ordination, distance=dlist$Ait)
  plist[[2]] <- plot_ordination(clrPhyseq, iMDS.Ait, color="host_disease")
  
  print("Bray")
  # Bray-Curtis
  set.seed(123)
  iMDS.Bray <- ordinate(csnPhyseq, ordination, distance=dlist$Bray)
  plist[[3]] <- plot_ordination(csnPhyseq, iMDS.Bray, color="host_disease")
  
  print("Canberra")
  # Canberra
  set.seed(123)
  iMDS.Can <- ordinate(nzcompPhyseq, ordination, distance=dlist$Can)
  plist[[4]] <- plot_ordination(nzcompPhyseq, iMDS.Can, color="host_disease")
  
  # Creating a dataframe to plot everything
  plot.df = plyr::ldply(plist, function(x) x$data)
  names(plot.df)[1] <- "distance"
  
  return(plot.df)
}

Now let’s plot!

a) All samples (both time points)

# Get the distances (both collection time points) & the plot data
dist.pozuelo <- getDistances()
## nb samples relPhyseq: 273 
## nb samples clrPhyseq: 273 
## nb samples csnPhyseq: 273 
## nb samples nzcompPhyseq: 273 
## [1] "Unifrac..."
## [1] "Aitchison..."
## [1] "Bray des bois..."
## [1] "Canberra <3..."
plot.df <- plotDistances2D(dist.pozuelo)
## [1] "Unifrac"
## [1] "Aitchison"
## [1] "Bray"
## [1] "Canberra"
# Plot
a <- ggplot(plot.df, aes(Axis.1, Axis.2, color=host_disease))+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = c('blue', 'red'))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_text(size=20),
        axis.title.x = element_blank())+
  labs(color="Disease")
b <- ggplot(plot.df, aes(Axis.1, Axis.2, color=Collection))+
  geom_line(aes(group=host_ID), color="black", lwd=0.1)+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = c('#fdbe85', '#e6550d'))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_blank(),
        axis.title.x = element_blank())+
  labs(color="Time point")
c <- ggplot(plot.df, aes(Axis.1, Axis.2, color=host_subtype))+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = scales::alpha(c("blue", "brown", "red", "orange"), .3))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_blank())+
  labs(color="Subtype")

ggpubr::ggarrange(a,b,c, nrow=3)

# ggsave(file.path(path.plots, "distances4_MDS_all.jpg"), height = 10, width = 15)

b) Each time point separately

#######
# 1st time point (n=179)
dist.pozuelo.1st <- getDistances(relPhyseq = subset_samples(physeq.rel, Collection=="1st"),
                                 clrPhyseq = subset_samples(physeq.clr, Collection=="1st"),
                                 csnPhyseq = subset_samples(physeq.CSN, Collection=="1st"),
                                 nzcompPhyseq = subset_samples(physeq.NZcomp, Collection=="1st"))
## nb samples relPhyseq: 179 
## nb samples clrPhyseq: 179 
## nb samples csnPhyseq: 179 
## nb samples nzcompPhyseq: 179 
## [1] "Unifrac..."
## [1] "Aitchison..."
## [1] "Bray des bois..."
## [1] "Canberra <3..."
plot.df.1st <- plotDistances2D(dlist = dist.pozuelo.1st, relPhyseq = subset_samples(physeq.rel, Collection=="1st"),
                                                         clrPhyseq = subset_samples(physeq.clr, Collection=="1st"),
                                                         csnPhyseq = subset_samples(physeq.CSN, Collection=="1st"),
                                                         nzcompPhyseq = subset_samples(physeq.NZcomp, Collection=="1st"))
## [1] "Unifrac"
## [1] "Aitchison"
## [1] "Bray"
## [1] "Canberra"
p1 <- ggplot(plot.df.1st, aes(Axis.1, Axis.2, color=host_disease))+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = c('blue', 'red'))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_text(size=20),
        axis.title.x = element_blank())+
  labs(color="Disease", title="Time 0")
p2 <- ggplot(plot.df.1st, aes(Axis.1, Axis.2, color=host_subtype))+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = scales::alpha(c("blue", "brown", "red", "orange"), .3))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_blank())+
  labs(color="Subtype")

ggpubr::ggarrange(p1,p2, nrow=2)

# ggsave(file.path(path.plots, "distances4_MDS_1stCollection.jpg"), height = 7, width = 15)


#######
# 2nd time point (n=94)
dist.pozuelo.2nd <- getDistances(relPhyseq = subset_samples(physeq.rel, Collection=="2nd"),
                                 clrPhyseq = subset_samples(physeq.clr, Collection=="2nd"),
                                 csnPhyseq = subset_samples(physeq.CSN, Collection=="2nd"),
                                 nzcompPhyseq = subset_samples(physeq.NZcomp, Collection=="2nd"))
## nb samples relPhyseq: 94 
## nb samples clrPhyseq: 94 
## nb samples csnPhyseq: 94 
## nb samples nzcompPhyseq: 94 
## [1] "Unifrac..."
## [1] "Aitchison..."
## [1] "Bray des bois..."
## [1] "Canberra <3..."
plot.df.2nd <- plotDistances2D(dist.pozuelo.2nd, relPhyseq = subset_samples(physeq.rel, Collection=="2nd"),
                                                 clrPhyseq = subset_samples(physeq.clr, Collection=="2nd"),
                                                 csnPhyseq = subset_samples(physeq.CSN, Collection=="2nd"),
                                                 nzcompPhyseq = subset_samples(physeq.NZcomp, Collection=="2nd"))
## [1] "Unifrac"
## [1] "Aitchison"
## [1] "Bray"
## [1] "Canberra"
p3 <- ggplot(plot.df.2nd, aes(Axis.1, Axis.2, color=host_disease))+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = c('blue', 'red'))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_text(size=20),
        axis.title.x = element_blank())+
  labs(color="Disease", title="Time +1month")

p4 <- ggplot(plot.df.2nd, aes(Axis.1, Axis.2, color=host_subtype))+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = scales::alpha(c("blue", "brown", "red", "orange"), .3))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_blank())+
  labs(color="Subtype")

ggpubr::ggarrange(p3,p4, nrow=2)

# ggsave(file.path(path.plots, "distances4_MDS_2ndCollection.jpg"), height = 7, width = 15)

2. Plot in 3D

For better visualization, we will also take a glance at reduction to 3D.

#____________________________________________________________________________________
# Plot 3D ordination
plotDistances3D <- function(d, name_dist){
  
  # Reset parameters
  mds.3D <- NULL
  xyz <- NULL
  fig.3D <- NULL
  
  # Reduce distance matrix to 3 dimensions
  set.seed(123)
  mds.3D <- metaMDS(d, method="MDS", k=3, trace = 0)
  xyz <- scores(mds.3D, display="sites") # pull out the (x,y,z) coordinates
  
  # Plot
  fig.3D <- plot_ly(x=xyz[,1], y=xyz[,2], z=xyz[,3], type="scatter3d", mode="markers",
                    color=sample_data(physeq.pozuelo)$host_disease, colors = c("blue", "red"))%>%
    layout(title = paste('MDS in 3D with', name_dist, 'distance', sep = ' '))
  
  return(fig.3D)
}

Now let’s plot!

plotDistances3D(dist.pozuelo$UniF, "UniFrac")
plotDistances3D(dist.pozuelo$Ait, "Aitchison")
plotDistances3D(dist.pozuelo$Canb, "Canberra")
plotDistances3D(dist.pozuelo$Bray, "Bray-Curtis")

HIERARCHICAL CLUSTERING

# For heatmaps: have group color
matcol <- data.frame(group = sample_data(physeq.pozuelo)[,"host_disease"],
                     host_subtype = sample_data(physeq.pozuelo)[,"host_subtype"],
                     Collection = sample_data(physeq.pozuelo)[,"Collection"])


# Function to get heatmap from the distances computed
plotHeatmaps <- function(dlist, fontsize){
  
  # Initialize variables
  i=1
  plist <- vector("list", 4)
  names(plist) <- names(dlist)
  
  # Loop through distances
  for(d in dlist){
    plist[[i]] <- pheatmap(as.matrix(d), 
                          clustering_distance_rows = d,
                          clustering_distance_cols = d,
                          fontsize = fontsize,
                          fontsize_col = fontsize-5,
                          fontsize_row = fontsize-5,
                          annotation_col = matcol,
                          annotation_row = matcol,
                          annotation_colors = list(host_disease = c('Healthy' = 'blue', 'IBS' = 'red'),
                                                   host_subtype = c('HC'='black', 'IBS-M'='orange', 'IBS-C'='brown', 'IBS-D'='red'),
                                                   Collection = c("1st" = '#fdbe85', "2nd" = '#e6550d')),
                          cluster_rows = T,
                          cluster_cols = T,
                          clustering_method = 'complete', # hc method
                          main = names(dlist)[i]) # have name of distance as title
    i <- i+1
  }
  
  return(plist)
}


# Get the heatmaps
heatmp.pozuelo <- plotHeatmaps(dlist = dist.pozuelo, fontsize = 8)

REPRODUCE PLOTS FROM PAPER

From the methods section of the Pozuelo et al. paper: “To calculate between-sample diversity, weighted and unweighted Unifrac metrics were applied to build phylogenetic distance matrices, which were then used to construct hierarchical cluster trees using unweighted pair group method with arithmetic mean (UPGMA) and PCoA representations. Finally, distance-based redundancy analyses were performed using the eigenvalues obtained in the PCoA.”

Figure 1 - Redundancy analysis on unweighted UniFrac data

# Run PCoA
set.seed(123)
pcoa <- ordinate(subset_samples(physeq.rel, Collection=="1st"), "PCoA", distance="unifrac", weighted=FALSE)
# Get PCoA eigenvectors
pcoa.eig <- as.data.frame(pcoa$vectors)

# A - Run on healthy/IBS
# Run distance-based RDA (dbRDA), which is an RDA on PCoA eigenvectors
df <- data.frame(sample_data(subset_samples(physeq.rel, Collection=="1st"))[, c("host_disease", "host_subtype")])
dbRDA <- rda(pcoa.eig ~ host_disease, data=df)
# Plot dbRDA with host_disease
plt.df <- merge(data.frame(plot(dbRDA)$sites), df, by="row.names")

a <- ggplot(plt.df, aes(x=RDA1, y=PC1, color=host_disease))+
  geom_point()+
  scale_color_manual(values=c("#08519c", "#ef3b2c"), name="")+
  theme_cowplot()+
  labs(x="dbRDA1", y="dbRDA2")

# B - Run on IBS subtypes
dbRDA.2 <- rda(pcoa.eig ~ host_subtype, data=df)
plt.df.2 <- merge(data.frame(plot(dbRDA.2)$sites), df, by="row.names")

b <- ggplot(plt.df.2, aes(x=RDA1, y=RDA2, color=host_subtype))+
  geom_point()+
  scale_color_manual(values=c("#08519c", "#fd8d3c", "#fee391", "#ef3b2c"), name="")+
  theme_cowplot()+
  labs(x="dbRDA1", y="dbRDA2")

# Put both figures together
ggdraw() +
  draw_plot(a, x = 0, y = 0, width = .5, height = 1) +
  draw_plot(b, x = 0.5, y = 0, width = .5, height = 1) +
  draw_plot_label(label = c("A", "B"), size = ,
                  x = c(0, 0.5), y = c(1, 1))

Figure 2 - Higher relative abundance of Bacteroidetes

ggplot(phylum.table %>% filter(Phylum %in% c("Firmicutes", "Bacteroidota"), Collection=="1st"),
       aes(x = reorder(host_ID, host_ID, function(x) -mean(phylum.table[host_ID == x & Phylum == 'Firmicutes', 'Abundance'])),
           y = Abundance, color = Phylum))+
  facet_wrap(~ host_disease, scales = "free_x", nrow=2) + # scales = "free" removes empty lines
  geom_line(aes(group=Phylum), lwd=1)+
  scale_color_manual(values=c("#de2d26", "#2b8cbe"))+
  theme_cowplot()+
  theme(axis.text.x = element_text(size=4, angle=90))+
  labs(x = "Samples", y = "Relative abundance", title = "Figure 2")

Figure 3 - Higher relative abundance of two bacterial families

#_____________________________________
# FIGURE 3

#Families
family.table <- physeq.pozuelo %>%
  tax_glom(taxrank = "Family") %>%                     # agglomerate at family level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt()                                             # Melt to long format

ggplot(family.table[family.table[,"Family"] == "Erysipelotrichaceae",],
       aes(x = host_disease, y = Abundance))+
  geom_boxplot()+
  geom_jitter(width = 0.1)+
  labs(x = "",  y = 'Relative Abundance', title = 'Erysipelotrichaceae family')+
  theme_classic()+
  ylim(0,0.05)

ggplot(family.table[family.table[,"Family"] == "Ruminococcaceae",],
       aes(x = host_disease, y = Abundance))+
  geom_boxplot()+
  geom_jitter(width = 0.1)+
  labs(x = "",  y = 'Relative Abundance', title = 'Ruminococcaceae family')+
  theme_classic()

Figure 4 - Dysbiosis at the family and genus level in IBS subtypes

#_____________________________________
# FIGURE 4A & 4C
# on figure 4A, we can't plot the "unknown Clostridiales"
# on figure 4C, we didn't detect Verrucomicrobiaceae in the Pozuelo dataset
# we will plot the Methanobacteriaceae, Erysipelotrichaceae, Christensenellaceae and Ruminococcaceae by IBS subtype like in panels 4A and 4C
colors <- c("#756bb1", "#31a354", "#de2d26","#3182bd")
names(colors) <- c("Methanobacteriaceae", "Erysipelotrichaceae", "Christensenellaceae", "Ruminococcaceae")

ggplot(family.table %>%
         filter(Collection=="1st") %>%
         filter(Family %in% c("Methanobacteriaceae", "Erysipelotrichaceae", "Christensenellaceae", "Ruminococcaceae")),
       aes(x = host_subtype,
           y = Abundance, fill = Family))+
  geom_bar(stat = "identity") +
  scale_fill_manual(values=colors)+
  theme_cowplot()+
  theme(axis.text.x=element_text(angle=90))+
  labs(x = "", y = "Relative abundance (?)", title = "Fig4A & 4C")

#_____________________________________
# FIGURE 4B
ggplot(family.table %>%
         filter(Family == "Erysipelotrichaceae") %>%
         filter(host_subtype %in% c("HC", "IBS-M")) %>%
         filter(Collection == "1st"),
       aes(x = host_subtype, y = Abundance))+
  geom_boxplot()+
  geom_jitter(width = 0.1)+
  labs(x = "",  y = 'Relative Abundance', title = 'Erysipelotrichaceae family (fig4B)')+
  theme_cowplot()+
  ylim(0,0.03)